2. Word2Vec


In [ ]:
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib

In [ ]:
# Select the Tk backend; this cell is placed before the pyplot import below.
# NOTE(review): the `%matplotlib inline` magic in the next cell chooses the
# inline backend afterwards, so this call may be redundant — confirm.
matplotlib.use("TkAgg")

In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline

Dataset

  • Julius Caesar
  • Macbeth

In [ ]:
# Relative path of the directory holding the input text files. The trailing
# slash matters: file names are appended to it by plain string concatenation.
data_dir = '../data/'

In [ ]:
# Path of the Macbeth text, built by appending the file name to data_dir.
macbeth_file = data_dir + 'macbeth.txt'

In [ ]:
# Path of the Julius Caesar text, built by appending the file name to data_dir.
caesar_file = data_dir + 'julius_caesar.txt'

Remove the stopwords


In [ ]:
# Path of the stop-word list (read line by line in the next cell).
stopword_file = data_dir + 'long_stopwords.txt'

In [ ]:
# Load the stop-word list (one entry per line) and normalise each entry with
# the same regex clean() applies to corpus words, so that an entry such as
# "they'll" is stored as "theyll" and can actually match a cleaned token.
# Entries are lower-cased as well, because clean() lower-cases every word
# before the lookup.
#
# The result is materialised as a real list on purpose: on Python 3 a bare
# map() is a one-shot iterator that the first `in` test inside clean() would
# exhaust, silently disabling stop-word removal for every later word.
# The regex also strips the trailing newline, so no separate '\n' pass is needed.
with open(stopword_file, 'r') as inpFile:
    stop_words = [re.sub('[^A-Za-z0-9]+', '', line).lower() for line in inpFile]

In [ ]:
# Echo the loaded stop words as the cell output (the whole collection is shown).
stop_words

In [ ]:
# Check which container type stop_words ended up as.
type(stop_words)

In [ ]:
def clean(word):
    """Normalise a raw token and blank it out if it is a stop word.

    The token is stripped of surrounding whitespace, lower-cased and reduced
    to its ASCII letters and digits. The normalised token is returned, unless
    it is found in the module-level ``stop_words``, in which case an empty
    string is returned instead.
    """
    normalised = re.sub('[^A-Za-z0-9]+', '', word.strip().lower())
    return '' if normalised in stop_words else normalised

In [ ]:
# Demo: clean() removes the apostrophe, so the stop-word lookup is done on 'kings'.
clean("king's")

In [ ]:
# Demo: the contraction is normalised to 'theyll' before the stop-word lookup.
clean("they'll")

In [ ]:
def read_sentences(path):
    """Read a text file and return one list of cleaned words per usable line.

    Each line is split on whitespace and every token is passed through
    clean(). Tokens that clean() maps to '' are dropped, and lines that end
    up with no tokens at all (blank lines, lines made only of stop words or
    punctuation) are skipped instead of being stored as empty sentences.

    Every sentence is a real list rather than a lazy map/filter object, so
    the corpus can be iterated more than once.
    """
    result = []
    with open(path, 'r') as inpFile:
        for line in inpFile:
            words = [word for word in (clean(token) for token in line.split()) if word]
            if words:
                result.append(words)
    return result


# Kept from the original cell; it is initialised here and never updated.
line_count = 0

# Corpus for Word2Vec: Macbeth first, then Julius Caesar, one sentence per line.
sentences = read_sentences(macbeth_file) + read_sentences(caesar_file)

In [ ]:
# Check the container type of the corpus built in the previous cell.
type(sentences)

Word2Vec model


In [ ]:
# Train the Word2Vec model on the combined corpus with 500-dimensional vectors.
# gensim 4 renamed the `size` keyword to `vector_size` and rejects the old
# name with a TypeError; older releases only know `size`. Try the current
# name first and fall back so the cell runs on either generation.
w2v_params = dict(window=5, workers=4, min_count=5)
try:
    model = Word2Vec(sentences, vector_size=500, **w2v_params)
except TypeError:
    model = Word2Vec(sentences, size=500, **w2v_params)

In [ ]:
# Show the learned vocabulary. Newer gensim releases keep the vocabulary on
# the KeyedVectors object (`model.wv`) — as `key_to_index` in gensim 4, as
# `vocab` before that — while very old releases expose `vocab` on the model
# itself. Resolve whichever one is available.
word_vectors = getattr(model, 'wv', model)
word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab

In [ ]:
# Collect every vocabulary word (labels) together with its vector (tokens),
# in matching order, as input for the t-SNE projection.
# The vocabulary and the vector lookup are taken from `model.wv` when it
# exists (the model-level `vocab` / `model[word]` access was removed in
# current gensim), falling back to the model itself on very old releases.
word_vectors = getattr(model, 'wv', model)
vocabulary = (word_vectors.key_to_index
              if hasattr(word_vectors, 'key_to_index')
              else word_vectors.vocab)

labels = list(vocabulary)
tokens = [word_vectors[word] for word in labels]

t-SNE plot to visualise the similarity between words


In [ ]:
# Configure the 2-D t-SNE projection.
# random_state is fixed so that re-running the notebook yields the same
# layout; without it every run produces a different plot.
# Recent scikit-learn releases renamed `n_iter` to `max_iter`; try the new
# name first and fall back to the old one on releases that reject it.
tsne_params = dict(perplexity=30, n_components=2, init='pca', random_state=42)
try:
    tsne_model = TSNE(max_iter=5000, **tsne_params)
except TypeError:
    tsne_model = TSNE(n_iter=5000, **tsne_params)

In [ ]:
# Fit t-SNE on the collected word vectors; each row of the result holds the
# projected coordinates of the word at the same position in `tokens`.
new_values = tsne_model.fit_transform(tokens)

In [ ]:
# Split the projected points into separate coordinate lists for plotting:
# first component of every row goes to x, second component to y.
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]

In [ ]:
# Scatter every projected word and write its label next to the point.
# Each word gets its own scatter() call, as before, so the points are drawn
# one by one rather than as a single collection.
plt.figure(figsize=(16, 12))
for label, x_pos, y_pos in zip(labels, x, y):
    plt.scatter(x_pos, y_pos)
    plt.annotate(
        label,
        xy=(x_pos, y_pos),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.show()

Analogies


In [ ]:
# Analogy query: caesar + duncan - scotland.
# Similarity queries live on the KeyedVectors object (`model.wv`); the
# model-level method was removed in gensim 4. Fall back to the model itself
# on very old releases that have no `wv` attribute.
getattr(model, 'wv', model).most_similar(positive=['caesar', 'duncan'], negative=['scotland'])

In [ ]:
# Analogy query: caesar + duncan - macbeth.
# Queried through `model.wv` when available (the model-level method was
# removed in gensim 4), otherwise through the model itself.
getattr(model, 'wv', model).most_similar(positive=['caesar', 'duncan'], negative=['macbeth'])

In [ ]:
# Analogy query: caesar + macbeth - banquo.
# Queried through `model.wv` when available (the model-level method was
# removed in gensim 4), otherwise through the model itself.
getattr(model, 'wv', model).most_similar(positive=['caesar', 'macbeth'], negative=['banquo'])

In [ ]:
# Analogy query: rome + scotland - banquo.
# Queried through `model.wv` when available (the model-level method was
# removed in gensim 4), otherwise through the model itself.
getattr(model, 'wv', model).most_similar(positive=['rome', 'scotland'], negative=['banquo'])

In [ ]:
# Odd-one-out query over four words from Macbeth.
# Queried through `model.wv` when available (the model-level method was
# removed in gensim 4), otherwise through the model itself.
getattr(model, 'wv', model).doesnt_match("duncan macbeth scotland banquo".split())

In [ ]: